Wykorzystane biblioteki:
library(ggplot2) #Do tworzenia regularnych wykresów
library(plotly) #Do tworzenia interaktywnych wykresów
library(dplyr) #Do manipulacji na danych
library(magrittr) #Dla operatora %>%
library(caTools)
library(reshape2) #Dla funkcji 'melt' upraszczającej stworzenie heatmapy korelacji
Wymuszanie identycznych rezultatów dla kolejnych wywołań i wczytanie danych:
set.seed(2112) #Zadbanie o taki sam rezultat dla kolejnych wywołań
data <- read.csv('Life_Expectancy_Data.csv')
sum_nas <- function(x, na.rm){
sum(is.na(x))
}
statistics <- list(list('Nothing_to_see_here', "Column name"),
list(mean, "Mean"), list(sd, "Standard Deviation"),
list(min, "Minimum"), list(max, "Maximum"),
list(median, "Median"), list(sum_nas, "Amount of NaN"))
all_cols = list()
for (x in statistics){
all_cols[[length(all_cols)+1]] <- x[[2]]
}
df_summary <- data.frame(matrix(ncol = length(all_cols), nrow = 0))
colnames(df_summary) <- all_cols
cols <- ncol(data)
cat("Number of rows: ", nrow(data), "\nNumber of columns: ", cols, "\n")
## Number of rows: 2938
## Number of columns: 22
ite <- 1
for (x in data){
dist = " "
if (class(x) != 'factor'){
new_row <- list()
for (y in statistics){
if (y[[2]] == 'Column name')
new_row[[length(new_row)+1]] <- colnames(data)[ite]
else
new_row[[length(new_row)+1]] <- y[[1]](x, na.rm=TRUE)
}
df_summary[nrow(df_summary) + 1, ] <- new_row
}
else{
names <- unique(x)
cat('\nFactor column: ', colnames(data)[ite], ', Unique values: ', paste(shQuote(names, type="cmd"), collapse=", "), '\n')
}
ite <- ite+1
}
##
## Factor column: Country , Unique values: "Afghanistan", "Albania", "Algeria", "Angola", "Antigua and Barbuda", "Argentina", "Armenia", "Australia", "Austria", "Azerbaijan", "Bahamas", "Bahrain", "Bangladesh", "Barbados", "Belarus", "Belgium", "Belize", "Benin", "Bhutan", "Bolivia (Plurinational State of)", "Bosnia and Herzegovina", "Botswana", "Brazil", "Brunei Darussalam", "Bulgaria", "Burkina Faso", "Burundi", "Cote d'Ivoire", "Cabo Verde", "Cambodia", "Cameroon", "Canada", "Central African Republic", "Chad", "Chile", "China", "Colombia", "Comoros", "Congo", "Cook Islands", "Costa Rica", "Croatia", "Cuba", "Cyprus", "Czechia", "Democratic People's Republic of Korea", "Democratic Republic of the Congo", "Denmark", "Djibouti", "Dominica", "Dominican Republic", "Ecuador", "Egypt", "El Salvador", "Equatorial Guinea", "Eritrea", "Estonia", "Ethiopia", "Fiji", "Finland", "France", "Gabon", "Gambia", "Georgia", "Germany", "Ghana", "Greece", "Grenada", "Guatemala", "Guinea", "Guinea-Bissau", "Guyana", "Haiti", "Honduras", "Hungary", "Iceland", "India", "Indonesia", "Iran (Islamic Republic of)", "Iraq", "Ireland", "Israel", "Italy", "Jamaica", "Japan", "Jordan", "Kazakhstan", "Kenya", "Kiribati", "Kuwait", "Kyrgyzstan", "Lao People's Democratic Republic", "Latvia", "Lebanon", "Lesotho", "Liberia", "Libya", "Lithuania", "Luxembourg", "Madagascar", "Malawi", "Malaysia", "Maldives", "Mali", "Malta", "Marshall Islands", "Mauritania", "Mauritius", "Mexico", "Micronesia (Federated States of)", "Monaco", "Mongolia", "Montenegro", "Morocco", "Mozambique", "Myanmar", "Namibia", "Nauru", "Nepal", "Netherlands", "New Zealand", "Nicaragua", "Niger", "Nigeria", "Niue", "Norway", "Oman", "Pakistan", "Palau", "Panama", "Papua New Guinea", "Paraguay", "Peru", "Philippines", "Poland", "Portugal", "Qatar", "Republic of Korea", "Republic of Moldova", "Romania", "Russian Federation", "Rwanda", "Saint Kitts and Nevis", "Saint Lucia", "Saint Vincent and the Grenadines", "Samoa", "San Marino", "Sao Tome and Principe", "Saudi Arabia", "Senegal", "Serbia", "Seychelles", "Sierra Leone", "Singapore", "Slovakia", "Slovenia", "Solomon Islands", "Somalia", "South Africa", "South Sudan", "Spain", "Sri Lanka", "Sudan", "Suriname", "Swaziland", "Sweden", "Switzerland", "Syrian Arab Republic", "Tajikistan", "Thailand", "The former Yugoslav republic of Macedonia", "Timor-Leste", "Togo", "Tonga", "Trinidad and Tobago", "Tunisia", "Turkey", "Turkmenistan", "Tuvalu", "Uganda", "Ukraine", "United Arab Emirates", "United Kingdom of Great Britain and Northern Ireland", "United Republic of Tanzania", "United States of America", "Uruguay", "Uzbekistan", "Vanuatu", "Venezuela (Bolivarian Republic of)", "Viet Nam", "Yemen", "Zambia", "Zimbabwe"
##
## Factor column: Status , Unique values: "Developing", "Developed"
df_summary
## Column name Mean Standard Deviation Minimum Maximum Median Amount of NaN
## 1 Year 2.007519e+03 4.613841e+00 2000.00000 2.015000e+03 2.008000e+03 0
## 2 Life.expectancy 6.922493e+01 9.523867e+00 36.30000 8.900000e+01 7.210000e+01 10
## 3 Adult.Mortality 1.647964e+02 1.242921e+02 1.00000 7.230000e+02 1.440000e+02 10
## 4 infant.deaths 3.030395e+01 1.179265e+02 0.00000 1.800000e+03 3.000000e+00 0
## 5 Alcohol 4.602861e+00 4.052413e+00 0.01000 1.787000e+01 3.755000e+00 194
## 6 percentage.expenditure 7.382513e+02 1.987915e+03 0.00000 1.947991e+04 6.491291e+01 0
## 7 Hepatitis.B 8.094046e+01 2.507002e+01 1.00000 9.900000e+01 9.200000e+01 553
## 8 Measles 2.419592e+03 1.146727e+04 0.00000 2.121830e+05 1.700000e+01 0
## 9 BMI 3.832125e+01 2.004403e+01 1.00000 8.730000e+01 4.350000e+01 34
## 10 under.five.deaths 4.203574e+01 1.604455e+02 0.00000 2.500000e+03 4.000000e+00 0
## 11 Polio 8.255019e+01 2.342805e+01 3.00000 9.900000e+01 9.300000e+01 19
## 12 Total.expenditure 5.938190e+00 2.498320e+00 0.37000 1.760000e+01 5.755000e+00 226
## 13 Diphtheria 8.232408e+01 2.371691e+01 2.00000 9.900000e+01 9.300000e+01 19
## 14 HIV.AIDS 1.742103e+00 5.077785e+00 0.10000 5.060000e+01 1.000000e-01 0
## 15 GDP 7.483158e+03 1.427017e+04 1.68135 1.191727e+05 1.766948e+03 448
## 16 Population 1.275338e+07 6.101210e+07 34.00000 1.293859e+09 1.386542e+06 652
## 17 thinness..1.19.years 4.839704e+00 4.420195e+00 0.10000 2.770000e+01 3.300000e+00 34
## 18 thinness.5.9.years 4.870317e+00 4.508882e+00 0.10000 2.860000e+01 3.300000e+00 34
## 19 Income.composition.of.resources 6.275511e-01 2.109036e-01 0.00000 9.480000e-01 6.770000e-01 167
## 20 Schooling 1.199279e+01 3.358920e+00 0.00000 2.070000e+01 1.230000e+01 163
Zadbanie o to, aby wykresy były niesamowicie eleganckie:
options(repr.plot.width=24, repr.plot.height=16)
Tworzenie eleganckich wykresów rozkładu zmiennych: warto zwrócić uwagę na wykorzystanie density-plotu dla danych ciągłych i barplota dla danych dyskretnych. Uwzględniono - dla kompletności - rozkład lat i krajów.
for (x in colnames(data)){
if (is.numeric(data[[x]]) && x!='Year')
print(data %>%
filter(!is.na(data[[x]])) %>%
ggplot(aes_string(x=x)) + geom_density(fill="#69b3a2", color="#e9ecef", alpha=0.8))
else
print(data %>%
filter(!is.na(data[[x]])) %>%
ggplot(aes_string(x=x)) + geom_bar() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, size = 6, hjust = 1)))
}
Dodatkowe wykresy prezentujący korelacje pomiędzy Life.expectancy a pozostałymi zmiennymi: pozwalają one zauważyć, że dla pewne wartości mogły nie zostać poprawnie udokumentowane
for (x in colnames(data)){
if (x == 'Life.expectancy')
next
plot_data <- data %>% filter(!is.na(data[[x]]) & !is.na(data[['Life.expectancy']]))
plot_itself <- plot_data %>% ggplot(aes_string(x=x, y='Life.expectancy')) + geom_point()
if (x == 'Country')
plot_itself <- plot_itself + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, size = 6, hjust = 1))
print(plot_itself)
}
Heatmapa przedstawiająca korelacje między zmiennymi:
cor_matrix <- data %>% select(-c(Country, Status)) %>% cor(use = "complete.obs") %>% round(2)
melt_matrix <- melt(cor_matrix, na.rm = TRUE)
#Wzorzec I: https://r-charts.com/correlation/heat-map-ggplot2/
#Wzorzec II: http://sthda.com/english/wiki/ggplot2-quick-correlation-matrix-heatmap-r-software-and-data-visualization
ggplot(data = melt_matrix, aes(Var2, Var1, fill = value))+
geom_tile()+
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Pearson\nCorrelation") +
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 5, hjust = 1))+
geom_text(aes(label = value), color = 'black', size = 1.5)+
coord_fixed()
Preparacja danych do stworzenia interaktywnego wykresu długości życia w funkcji kraju i roku:
unique_countries <- unique(data$Country)
buttons_countries <- list()
for (i in seq_along(unique_countries)){
buttons_countries[[length(buttons_countries)+1]] <- list(method = "restyle",
args = list("transforms[0].value", unique_countries[i]),
label = unique_countries[i])
}
Wykres długości życia w funkcji kraju i roku:
#Wzorzec: https://stackoverflow.com/questions/63906441/plotly-r-using-colour-and-transformation-with-a-line-plot
p <- data %>% filter(!is.na(Life.expectancy)) %>%
plot_ly(
type = 'scatter',
x = ~Year,
y = ~Life.expectancy,
mode = 'markers',
transforms = list(
list(
type = 'filter',
target = ~Country,
operation = '=',
value = unique_countries[1]
)
)) %>% layout(
updatemenus = list(
list(
type = 'dropdown',
active = 0,
buttons = buttons_countries
)
)
)
p
Preparacja danych dla regresora
change <- function(data, name, borderline, multiplier) {
data[[name]][data[[name]] < borderline & !is.na(data[[name]])] <- data[[name]][data[[name]] < borderline & !is.na(data[[name]])]*multiplier
data
}
data <- data %>% filter(!is.na(Life.expectancy))
data <- change(data, 'BMI', 8.5, 10)
data <- change(data, 'Diphtheria', 10, 10)
for (x in colnames(data)){
if (data %>% filter(is.na(data[[x]])) %>% nrow == 0)
next
data[[paste(x, '_na', sep='')]] <- rep(0, nrow(data))
data[[paste(x, '_na', sep='')]][is.na(data[[x]])] <- 1
mid = data[[x]]
data[[x]][is.na(data[[x]])] <- median(data[[x]], na.rm=TRUE)
}
Usunięcie powtarzających się kolumn:
marked = 'Q'
while (marked != '-'){
marked='-'
for (x in colnames(data)){
for (y in colnames(data)){
if (!is.numeric(data[[x]]) || !is.numeric(data[[y]]) || x==y)
next
summa = ifelse (data[[x]] == data[[y]], 1, 0)
if (sum(summa) == nrow(data)){
marked = y
break
}
}
if (marked!='-') break
}
if (marked!='-')
data <- data %>% select(-c(y))
}
Wykonanie regresora:
tmp_data= sample.split(data, SplitRatio = 0.3)
train = subset(data,tmp_data==TRUE)
test = subset(data,tmp_data==FALSE)
lm_death <- train %>% select(-c(Country)) %>% lm(formula=Life.expectancy ~ .)
summary(lm_death)
##
## Call:
## lm(formula = Life.expectancy ~ ., data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.9441 -2.1448 0.0542 2.1618 11.9756
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.553e+01 6.713e+01 1.274 0.20299
## Year -1.755e-02 3.349e-02 -0.524 0.60048
## StatusDeveloping -2.013e+00 4.603e-01 -4.372 1.38e-05 ***
## Adult.Mortality -1.791e-02 1.423e-03 -12.584 < 2e-16 ***
## infant.deaths 7.171e-02 1.570e-02 4.566 5.70e-06 ***
## Alcohol 5.769e-02 4.453e-02 1.295 0.19552
## percentage.expenditure 2.155e-04 1.464e-04 1.472 0.14142
## Hepatitis.B -2.755e-02 6.614e-03 -4.166 3.42e-05 ***
## Measles -1.281e-06 1.528e-05 -0.084 0.93322
## BMI 9.986e-02 1.337e-02 7.468 2.03e-13 ***
## under.five.deaths -5.317e-02 1.141e-02 -4.661 3.64e-06 ***
## Polio 1.001e-02 7.736e-03 1.293 0.19624
## Total.expenditure -9.760e-03 5.727e-02 -0.170 0.86471
## Diphtheria 1.308e-01 1.370e-02 9.548 < 2e-16 ***
## HIV.AIDS -4.367e-01 3.180e-02 -13.735 < 2e-16 ***
## GDP 7.766e-06 2.270e-05 0.342 0.73234
## Population -2.325e-09 2.538e-09 -0.916 0.35984
## thinness..1.19.years -6.290e-03 7.931e-02 -0.079 0.93680
## thinness.5.9.years 8.942e-02 7.763e-02 1.152 0.24968
## Income.composition.of.resources 5.463e+00 1.026e+00 5.324 1.30e-07 ***
## Schooling 4.832e-01 8.226e-02 5.874 6.10e-09 ***
## Alcohol_na 2.167e+00 1.414e+00 1.533 0.12573
## Hepatitis.B_na 7.334e-01 4.227e-01 1.735 0.08309 .
## BMI_na -3.401e+00 1.314e+00 -2.588 0.00981 **
## Polio_na 2.695e+00 2.095e+00 1.286 0.19873
## Total.expenditure_na -6.164e-01 1.283e+00 -0.481 0.63095
## GDP_na -1.042e+00 6.542e-01 -1.592 0.11173
## Population_na 1.242e+00 5.450e-01 2.280 0.02288 *
## Income.composition.of.resources_na -1.087e+00 7.572e-01 -1.436 0.15135
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.678 on 848 degrees of freedom
## Multiple R-squared: 0.8578, Adjusted R-squared: 0.8531
## F-statistic: 182.7 on 28 and 848 DF, p-value: < 2.2e-16
pred <- predict(lm_death, test)
Wartość \(rmse\):
sqrt(mean((pred - test$Life.expectancy)^2))
## [1] 3.853165